In [1]:
!pip install lime
Requirement already satisfied: lime in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (0.2.0.1)
Requirement already satisfied: matplotlib in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (3.9.2)
Requirement already satisfied: numpy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (2.0.2)
Requirement already satisfied: scipy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (1.14.1)
Requirement already satisfied: tqdm in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (4.67.1)
Requirement already satisfied: scikit-learn>=0.18 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (1.5.2)
Requirement already satisfied: scikit-image>=0.12 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from lime) (0.24.0)
Requirement already satisfied: networkx>=2.8 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (3.4.2)
Requirement already satisfied: pillow>=9.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (10.4.0)
Requirement already satisfied: imageio>=2.33 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (2.36.0)
Requirement already satisfied: tifffile>=2022.8.12 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (2024.9.20)
Requirement already satisfied: packaging>=21 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (24.1)
Requirement already satisfied: lazy-loader>=0.4 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-image>=0.12->lime) (0.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn>=0.18->lime) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn>=0.18->lime) (3.5.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (1.3.0)
Requirement already satisfied: cycler>=0.10 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (4.53.1)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (1.4.5)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->lime) (2.9.0.post0)
Requirement already satisfied: colorama in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from tqdm->lime) (0.4.6)
Requirement already satisfied: six>=1.5 in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Import LIME
import lime.lime_tabular

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of regression models up to Decision Tree
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('KNN', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor())
]

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 10 folds
    cv_scores_rmse = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=10, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Hyperparameter tuning using GridSearchCV for the selected models
param_grids = {
    'Linear Regression': {},  # No hyperparameters for linear regression
    'Ridge Regression': {'regressor__alpha': [0.1, 1, 10, 100]},  # Note the 'regressor__' prefix
    'Lasso Regression': {'regressor__alpha': [0.1, 1, 10]},  # Note the 'regressor__' prefix
    'KNN': {'regressor__n_neighbors': [3, 5, 10, 15], 'regressor__weights': ['uniform', 'distance']},  # Note the 'regressor__' prefix
    'Decision Tree': {'regressor__max_depth': [None, 5, 10, 20], 'regressor__min_samples_split': [2, 5, 10]}  # Note the 'regressor__' prefix
}

# Perform hyperparameter tuning and evaluation for each model
for name, model in models:
    print(f"\nTraining and hyperparameter tuning for {name}...")

    # Build a pipeline for each model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize the features
        ('pca', PCA(n_components=0.95)),  # Apply PCA for dimensionality reduction
        ('regressor', model)  # Model
    ])

    # Hyperparameter tuning with GridSearchCV
    param_grid = param_grids.get(name, {})
    
    # Skip models with no hyperparameters to tune
    if param_grid:
        grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best {name} model: {grid_search.best_params_}")
    else:
        pipeline.fit(X_train, y_train)
        best_model = pipeline  # Use the pipeline as the best model

    # Cross-validation after tuning
    rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)
    print(f"Cross-validation after tuning for {name}:")
    print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
    print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}")

    # Evaluate the model on the test data
    y_pred = best_model.predict(X_test)
    
    # Apply clipping strategy to keep predictions within the 0-5 range
    y_pred_clipped = np.clip(y_pred, 0, 5)
    
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_clipped))
    test_r2 = r2_score(y_test, y_pred_clipped)

    print(f"\nTest RMSE: {test_rmse}")
    print(f"Test R²: {test_r2}")

    # LIME Explanation
    print(f"\nLIME Explanation for {name}:")

    # Convert the training data to numpy arrays for LIME
    X_train_np = X_train.to_numpy() if isinstance(X_train, pd.DataFrame) else X_train
    feature_names = X.columns.tolist()

    # Create a LIME explainer instance
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train_np, 
        training_labels=y_train.to_numpy(),
        mode='regression',
        feature_names=feature_names,
        verbose=True
    )

    # Explain the first instance in the test set
    lime_exp = explainer.explain_instance(X_test.iloc[0].to_numpy(), best_model.predict, num_features=5)
    lime_exp.show_in_notebook()
Training and hyperparameter tuning for Linear Regression...
Cross-validation after tuning for Linear Regression:
CV Mean RMSE (after tuning): 0.29643360817884284, CV RMSE Std: 0.05338130585877468
CV Mean R² (after tuning): 0.8757317869351426, CV R² Std: 0.020583447413062995

Test RMSE: 0.5289088981538131
Test R²: 0.8901144203318311

LIME Explanation for Linear Regression:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Intercept 2.694298070314206
Prediction_local [3.09354252]
Right: 4.837937298579014
Training and hyperparameter tuning for Ridge Regression...
Best Ridge Regression model: {'regressor__alpha': 100}
Cross-validation after tuning for Ridge Regression:
CV Mean RMSE (after tuning): 0.29233334123603, CV RMSE Std: 0.053173754478938955
CV Mean R² (after tuning): 0.877478808221376, CV R² Std: 0.020353105439757827

Test RMSE: 0.5277177423867241
Test R²: 0.8906088096406924

LIME Explanation for Ridge Regression:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Intercept 2.6400669587558934
Prediction_local [3.19352113]
Right: 4.848054997023594
Training and hyperparameter tuning for Lasso Regression...
Best Lasso Regression model: {'regressor__alpha': 0.1}
Cross-validation after tuning for Lasso Regression:
CV Mean RMSE (after tuning): 0.47471008407376114, CV RMSE Std: 0.07811254918443654
CV Mean R² (after tuning): 0.8011821415065745, CV R² Std: 0.028978128431378893

Test RMSE: 0.6908028328724246
Test R²: 0.812549252632241

LIME Explanation for Lasso Regression:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Intercept 2.730877464178246
Prediction_local [2.93101529]
Right: 5.075368161590885
Training and hyperparameter tuning for KNN...
Best KNN model: {'regressor__n_neighbors': 3, 'regressor__weights': 'distance'}
Cross-validation after tuning for KNN:
CV Mean RMSE (after tuning): 0.41175991957459557, CV RMSE Std: 0.09491977354924565
CV Mean R² (after tuning): 0.8263382304760768, CV R² Std: 0.04224531935297672

Test RMSE: 0.6675267403320166
Test R²: 0.8249684688411176

LIME Explanation for KNN:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Intercept 3.290188633943761
Prediction_local [3.74078172]
Right: 5.000000000000001
Training and hyperparameter tuning for Decision Tree...
Best Decision Tree model: {'regressor__max_depth': 20, 'regressor__min_samples_split': 5}
Cross-validation after tuning for Decision Tree:
CV Mean RMSE (after tuning): 0.858903676269741, CV RMSE Std: 0.19255571892232373
CV Mean R² (after tuning): 0.6031250925037617, CV R² Std: 0.10214507406332363

Test RMSE: 0.8003659427520516
Test R²: 0.7483736273732797

LIME Explanation for Decision Tree:
C:\Users\dhanu\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Intercept 4.096315735187511
Prediction_local [3.93953366]
Right: 5.0
In [6]:
!pip install xgboost
Requirement already satisfied: xgboost in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (2.1.3)
Requirement already satisfied: numpy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (2.0.2)
Requirement already satisfied: scipy in c:\users\dhanu\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.14.1)
[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
In [7]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99)  # Preserve 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)

# XGBoost model
model = XGBRegressor(random_state=42, n_jobs=-1)

# Hyperparameter tuning grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# Hyperparameter tuning
print(f"Training and hyperparameter tuning for XGBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best XGBoost model: {grid_search.best_params_}")

# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)

# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)

# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)

print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")

# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)

print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")

# LIME Explanation
print(f"\nLIME Explanation for XGBoost:")
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train, 
    training_labels=y_train, 
    mode='regression', 
    verbose=True, 
    feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])], 
    feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for XGBoost...
Best XGBoost model: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}

Test RMSE (scaled): 0.5386805361452687
Test R² (scaled): 0.8860166218904774

Test RMSE (original): 0.5386805361452687
Test R² (original): 0.8860166218904774

LIME Explanation for XGBoost:
Intercept 2.041903063041338
Prediction_local [4.48649382]
Right: 5.025804
In [8]:
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99)  # Preserve 99% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)

# AdaBoost model
model = AdaBoostRegressor(random_state=42)

# Hyperparameter tuning grid
param_grid = {
    'n_estimators': [50, 100],             # Number of boosting rounds
    'learning_rate': [0.01, 0.1],         # Learning rate
    'loss': ['linear', 'square']           # Loss function options
}

# Hyperparameter tuning
print(f"Training and hyperparameter tuning for AdaBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best AdaBoost model: {grid_search.best_params_}")

# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)

# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)

# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)

print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")

# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)

print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")

# LIME Explanation for AdaBoost
print(f"\nLIME Explanation for AdaBoost:")
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train, 
    training_labels=y_train, 
    mode='regression', 
    verbose=True, 
    feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])], 
    feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for AdaBoost...
Best AdaBoost model: {'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 100}

Test RMSE (scaled): 1.0199944928960172
Test R² (scaled): 0.5913282566763101

Test RMSE (original): 1.0199944928960172
Test R² (original): 0.5913282566763101

LIME Explanation for AdaBoost:
Intercept 2.3758016834849527
Prediction_local [4.07971416]
Right: 3.8797909407665507
In [9]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99)  # Preserve 99% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)

# Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter tuning grid
param_grid = {
    'n_estimators': [50, 100],             # Number of boosting rounds
    'learning_rate': [0.01, 0.1],         # Learning rate
    'max_depth': [3, 5],                   # Maximum depth of trees
    'subsample': [0.8, 1.0]                # Fraction of samples for each tree
}

# Hyperparameter tuning
print(f"Training and hyperparameter tuning for Gradient Boosting...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Gradient Boosting model: {grid_search.best_params_}")

# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)

# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)

# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)

print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")

# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)

print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")

# LIME Explanation for Gradient Boosting
print(f"\nLIME Explanation for Gradient Boosting:")
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train, 
    training_labels=y_train, 
    mode='regression', 
    verbose=True, 
    feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])], 
    feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for Gradient Boosting...
Best Gradient Boosting model: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}

Test RMSE (scaled): 0.5817735263264968
Test R² (scaled): 0.8670504517339344

Test RMSE (original): 0.5817735263264968
Test R² (original): 0.8670504517339344

LIME Explanation for Gradient Boosting:
Intercept 2.1464994848153753
Prediction_local [4.45243761]
Right: 5.001019534425308
In [ ]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np
import lime.lime_tabular

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machinelearning rajeev/ML TRAIN DATASETS/train_t5_embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Scale the target variable (y) to range [0, 5] using MinMaxScaler
scaler_y = MinMaxScaler(feature_range=(0, 5))
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Standardize the features
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.99)  # Preserve 99% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_scaled, test_size=0.2, random_state=42)

# Random Forest model
model = RandomForestRegressor(random_state=42)

# Hyperparameter tuning grid
param_grid = {
    'n_estimators': [50, 100],             # Number of trees in the forest
    'max_depth': [3, 5, 10],               # Maximum depth of trees
    'min_samples_split': [2, 5],           # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],            # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Hyperparameter tuning
print(f"Training and hyperparameter tuning for Random Forest...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Random Forest model: {grid_search.best_params_}")

# Cross-validation performance on the training set for RMSE and R²
train_cv_r2_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
train_cv_rmse_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Mean and Standard Deviation for R² and RMSE
train_cv_mean_r2 = train_cv_r2_scores.mean()
train_cv_std_r2 = train_cv_r2_scores.std()

train_cv_mean_rmse = np.sqrt(-train_cv_rmse_scores.mean())  # RMSE is negative, so negate to make it positive
train_cv_std_rmse = np.std(np.sqrt(-train_cv_rmse_scores))

# Output the cross-validation metrics on the training set
print(f"\nTraining CV Mean R²: {train_cv_mean_r2}")
print(f"Training CV Std R²: {train_cv_std_r2}")
print(f"Training CV Mean RMSE: {train_cv_mean_rmse}")
print(f"Training CV Std RMSE: {train_cv_std_rmse}")

# Evaluate the model on the test data
y_pred_scaled = best_model.predict(X_test)

# Clip predictions to the range [0, 5] to ensure valid outputs
y_pred_scaled_clipped = np.clip(y_pred_scaled, 0, 5)

# Evaluate performance metrics on the scaled target
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_scaled_clipped))
test_r2 = r2_score(y_test, y_pred_scaled_clipped)

print(f"\nTest RMSE (scaled): {test_rmse}")
print(f"Test R² (scaled): {test_r2}")

# Rescale the predictions and test target back to original range for final evaluation
y_pred_original = scaler_y.inverse_transform(y_pred_scaled_clipped.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

original_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
original_r2 = r2_score(y_test_original, y_pred_original)

print(f"\nTest RMSE (original): {original_rmse}")
print(f"Test R² (original): {original_r2}")

# LIME Explanation for Random Forest
print(f"\nLIME Explanation for Random Forest:")
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train, 
    training_labels=y_train, 
    mode='regression', 
    verbose=True, 
    feature_names=[f'PCA_{i+1}' for i in range(X_train.shape[1])], 
    feature_selection='auto'
)
lime_exp = explainer.explain_instance(X_test[0], best_model.predict, num_features=5)
lime_exp.show_in_notebook()
Training and hyperparameter tuning for Random Forest...
In [ ]: